# Load the raw database, dropping columns 2-9 and 11-13 by position.
# NOTE(review): positional column removal is fragile if the CSV layout
# changes -- confirm these indices against the current file.
data <- read.csv("../data/filledDatabase.csv")[, -c(2:9, 11:13)]

# Clean the data, then collapse it (both helpers are defined elsewhere).
data <- clean_data(data) %>% collapse_data()

# Keep the compound identifier and the two grouping columns as standalone
# vectors, separate from the predictors.
compound <- data$Compound
group_cat <- data$GroupCat
space_group <- data$SpaceGroup

# Drop the identifier and the X / Z columns from the modelling data.
# The projection onto the first 13 principal components (next line) is
# currently disabled, so the models below are fit on the original columns.
data <- select(data, -c("Compound", "X", "Z"))
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()

# Split the row indices into 5 folds for cross-validation later.
# seq_len() replaces 1:nrow() so that an empty data set gives an empty index
# vector instead of c(1, 0).
# NOTE(review): no seed is set, so the fold assignment differs between runs.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)

Multinomial Regression

library(glmnet)

# Predictor matrix: drop the first column of `data` (presumably the GroupCat
# response -- confirm), expand SpaceGroup into dummy columns, then remove the
# original SpaceGroup / SpaceGroupNumber columns before converting to a matrix.
X <- data[, -1] %>%
  dummy_cols(select_columns = "SpaceGroup") %>%
  select(-c(SpaceGroupNumber, SpaceGroup)) %>%
  as.matrix()

# Response: the GroupCat column as a one-column matrix.
Y <- as.matrix(data$GroupCat)

Shrinkage

Ridge

# Ridge (alpha = 0) multinomial fit over glmnet's lambda path, followed by the
# coefficient traces plotted against lambda.
model_ridge <- glmnet(
  X, Y,
  family = "multinomial",
  alpha = 0
)
plot(model_ridge, label = TRUE, xvar = "lambda")

LASSO

# LASSO (alpha = 1) multinomial fit over glmnet's lambda path, followed by the
# coefficient traces plotted against lambda.
model_lasso <- glmnet(
  X, Y,
  family = "multinomial",
  alpha = 1
)
plot(model_lasso, label = TRUE, xvar = "lambda")

Coefficients

Ridge

# 5-fold cross-validated ridge fit, scored by multinomial deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)

# Coefficients at the lambda minimising CV deviance, restricted to the three
# class columns and plotted.
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

LASSO

# 5-fold cross-validated LASSO fit, scored by multinomial deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)

# Coefficients at the lambda minimising CV deviance, restricted to the three
# class columns and plotted.
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

Elastic Net

library(caret)

# Elastic net tuned through caret: 5-fold CV with tuneLength = 10 for the
# glmnet tuning grid, using every other column of `data` as a predictor.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  trControl = trainControl(method = "cv", number = 5),
  tuneLength = 10
)

# Coefficients of the final model at the selected lambda, restricted to the
# three class columns and plotted.
get_coef(
  elastic_cv$finalModel,
  tuning_parameter = elastic_cv$bestTune$lambda
) %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

Classification accuracy rate

Ridge

# Prediction results for ridge (alpha = 0) at the CV-selected lambda, then the
# `r` element passed to the accuracy printer.
tb_ridge <- prediction_table(
  alpha = 0,
  lambda = ridge_cv$lambda.min
)
print_accurate_tb(tb_ridge$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.9459459 0.9594595 0.96 0.9726027 0.9466667 0.956935
# Ridge: display the `t` table as counts.
highlight_tb_count(tb_ridge$t)
Cubic Others Tilted
Cubic 178 2 1
Others 0 28 5
Tilted 1 7 149
Total 179 37 155
# Ridge: display the `t` table as percentages.
highlight_tb_percent(tb_ridge$t)
Cubic Others Tilted
Cubic 0.99 0.05 0.01
Others 0 0.76 0.03
Tilted 0.01 0.19 0.96
Total 100% 100% 100%
# Ridge: flatten the `t` table to long form and list the cells from most to
# least frequent.
arrange(as.data.frame(tb_ridge$t), desc(Freq))
##     Var1   Var2 Freq
## 1  Cubic  Cubic  178
## 2 Tilted Tilted  149
## 3 Others Others   28
## 4 Tilted Others    7
## 5 Others Tilted    5
## 6  Cubic Others    2
## 7 Tilted  Cubic    1
## 8  Cubic Tilted    1
## 9 Others  Cubic    0

LASSO

# Prediction results for LASSO (alpha = 1) at the CV-selected lambda, then the
# `r` element passed to the accuracy printer.
tb_lasso <- prediction_table(
  alpha = 1,
  lambda = lasso_cv$lambda.min
)
print_accurate_tb(tb_lasso$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.9864865 0.9459459 0.96 0.9726027 0.9333333 0.9596737
# LASSO: display the `t` table as counts.
highlight_tb_count(tb_lasso$t)
Cubic Others Tilted
Cubic 178 1 0
Others 0 29 6
Tilted 1 7 149
Total 179 37 155
# LASSO: display the `t` table as percentages.
highlight_tb_percent(tb_lasso$t)
Cubic Others Tilted
Cubic 0.99 0.03 0
Others 0 0.78 0.04
Tilted 0.01 0.19 0.96
Total 100% 100% 100%

Elastic Net

# Prediction results for the elastic net at the alpha / lambda pair selected
# by caret. The tuning values are read by column name rather than by position,
# matching the `bestTune$lambda` access used when plotting the coefficients
# and staying correct regardless of the column order of `bestTune`.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# Pass the `r` element to the accuracy printer.
tb_elastic$r %>% print_accurate_tb()
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.9864865 0.972973 0.96 0.9589041 0.9466667 0.965006
# Elastic net: display the `t` table as counts.
highlight_tb_count(tb_elastic$t)
Cubic Others Tilted
Cubic 178 1 0
Others 0 29 4
Tilted 1 7 151
Total 179 37 155
# Elastic net: display the `t` table as percentages.
highlight_tb_percent(tb_elastic$t)
Cubic Others Tilted
Cubic 0.99 0.03 0
Others 0 0.78 0.03
Tilted 0.01 0.19 0.97
Total 100% 100% 100%